from sqlalchemy import create_engine
import pandas as pd
import pandas.io.sql as sqlio
import psycopg2
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
import seaborn as sns
from ast import literal_eval
#credentials anonymized
engine= create_engine('postgresql+psycopg2://postgres:pw@hostname:port/db_name')
df = pd.read_csv("Data/churn.csv")
df.columns = df.columns.str.lower()
df['first_paid_date'] = pd.to_datetime(df['first_paid_date'])
df['current_month'] = pd.to_datetime(df['month'])
df['current_month'] = df['current_month'].dt.strftime("%Y-%m")
df['cohort_month'] = df['first_paid_date'].dt.strftime("%Y-%m")
df = df.round({'throughput':0})
df['mtu'] = df['mtu'].fillna(0)
df['source_types']=df['source_types'].fillna('')
df['counts_by_source_type']=df['counts_by_source_type'].fillna('')
df['destination_names']=df['destination_names'].fillna('')
df['counts_by_destination_name']=df['counts_by_destination_name'].fillna('')
df['server_side_integration_calls'] = df['server_side_integration_calls'].fillna(0)
def literal_return(val):
try:
return literal_eval(val)
except (ValueError, SyntaxError) as e:
return val
df['source_types'] = df['source_types'].apply(literal_return)
df['counts_by_source_type']=df['counts_by_source_type'].apply(literal_return)
df['destination_categories']=df['destination_categories'].apply(literal_return)
df['destination_categories_counts']=df['destination_categories_counts'].apply(literal_return)
df['destination_names']=df['destination_names'].apply(literal_return)
df['counts_by_destination_name']=df['counts_by_destination_name'].apply(literal_return)
df['total_dest_cat_count'] = (
df.destination_categories_counts.apply(lambda x: sum([int(e) for e in x]) if type(x) == list else 0)
)
df['total_sources_count'] = (
df.counts_by_source_type.apply(lambda x: sum([int(e) for e in x]) if type(x) == list else 0)
)
df['total_dest_count'] = (
df.counts_by_destination_name.apply(lambda x: sum([int(e) for e in x]) if type(x) == list else 0)
)
#sql manipulations after loading original churn data in to a postgres database table called churn.
sql_query=("""
drop table if exists churn_info;
create table churn_info as
with max_month as
(select u_customer_id,max(months_since_start) as max_month_available
from churn
group by u_customer_id),
first_month as
(select u_customer_id,payment_plan,first_paid_date
from churn
where months_since_start = 0),
first_churn as
(select u_customer_id,min(months_since_start) as first_churn
from churn
where paying = 'FALSE'
group by u_customer_id),
customer_info as
(select t1.*,
t3.payment_plan as payment_plan_started,
t3.first_paid_date as cohort_month,
t2.paying as payment_status_end,
t2.payment_plan as payment_plan_at_end
from
max_month t1, churn t2, first_month t3
where t1.u_customer_id = t2.u_customer_id
and t1.max_month_available = t2.months_since_start
and t1.u_customer_id = t3.u_customer_id)
select t1.*, t4.first_churn as first_churn_month
from customer_info t1
left join first_churn t4
on t1.u_customer_id = t4.u_customer_id ;
alter table churn_info add column payment_plan varchar(100);
update churn_info
set payment_plan = payment_plan_at_end
where payment_plan_at_end = payment_plan_started;
update churn_info
set payment_plan = concat(payment_plan_started,'-',payment_plan_at_end)
where payment_plan_at_end != payment_plan_started;
alter table churn_info
add column prev_churn_status varchar(100),
add column final_churn_status varchar(100);
update churn_info
set final_churn_status = null;
update churn_info
set final_churn_status = 'churned'
where payment_status_end = 'FALSE';
update churn_info
set final_churn_status = 'retained'
where payment_status_end = 'TRUE';
update churn_info set prev_churn_status = null;
update churn_info
set prev_churn_status = 'never_churned'
where first_churn_month is null;
update churn_info
set prev_churn_status = 'reactivated'
where first_churn_month is not null;
update churn_info
set prev_churn_status = 'first_churn'
where payment_status_end = 'FALSE'
and first_churn_month = max_month_available;""",engine)
customer_info = pd.read_sql("""
with churn_times as
(select u_customer_id,count(months_since_start) no_of_churns
from churn
where paying = 'FALSE'
group by u_customer_id)
select t1.*, t2.no_of_churns
from churn_info t1
left join churn_times t2
on t1.u_customer_id = t2.u_customer_id;""",engine)
customer_info = customer_info.fillna(0)
customer_info['tenure'] = np.where(customer_info['prev_churn_status'] == 'reactivated',
customer_info['first_churn_month'],customer_info['max_month_available'])
final_churn = customer_info.groupby(['payment_plan','final_churn_status']).agg({'u_customer_id':'count'}).reset_index()
final_churn['percent'] = final_churn['u_customer_id'] / final_churn.groupby('payment_plan')['u_customer_id'].transform('sum')
final_churn['total_per_plan'] = final_churn.groupby('payment_plan')['u_customer_id'].transform('sum')
final_churn['percent_per_plan'] = final_churn['total_per_plan'] / final_churn['u_customer_id'].sum()
final_churn.round({'percent':2,'percent_per_plan':2})
px.bar(final_churn,x='payment_plan',y='percent',barmode='group',title='Customers churn & retention in each payment plan',
labels={'percent':'% of customers'},height=400,width=700,
color='final_churn_status',text='final_churn_status')
churn = customer_info.groupby(['payment_plan','prev_churn_status','final_churn_status']).agg({'u_customer_id':'count'}).reset_index()
churn.rename(columns={'u_customer_id':'customer_count'})
cohort = pd.read_sql("""
with t1 as
(select u_customer_id, first_paid_date,months_since_start
from churn where u_customer_id in
(select u_customer_id from churn_info ci where prev_churn_status = 'never_churned')
union
select t1.u_customer_id, first_paid_date,months_since_start
from churn t1, churn_info t2
where t1.u_customer_id = t2.u_customer_id
and t1.months_since_start < t2.first_churn_month
and t2.prev_churn_status != 'never_churned')
select first_paid_date,months_since_start,count(u_customer_id)
from t1
group by 1, 2""",engine)
#cohort = cohort.groupby(['first_paid_date','months_since_start']).agg(customer_count=('u_customer_id','count')).reset_index()
cohort = cohort.pivot(index='first_paid_date',columns='months_since_start',values='count')
retention = cohort.divide(cohort.iloc[:,0],axis=0)
retention = round(retention * 100)
plt.figure(figsize=(16,10))
sns.heatmap(retention.iloc[:, 0:21],annot=True,fmt='g')
plt.ylabel('cohort month')
df = df.merge(customer_info[['u_customer_id','final_churn_status','prev_churn_status','no_of_churns','payment_plan_at_end','tenure']],on=['u_customer_id'],how='inner')
df.groupby(['payment_plan_at_end','prev_churn_status'])['mtu'].describe().round(0)
df.groupby(['payment_plan_at_end','final_churn_status'])['throughput'].describe().round(0)
df.groupby(['payment_plan_at_end','final_churn_status'])['all_integrations_count'].describe().round(0)
df.groupby(['payment_plan_at_end','final_churn_status'])['active_integrations_count'].describe().round(0)
df.groupby(['payment_plan_at_end','final_churn_status'])['total_events'].describe().round(0)
df.groupby(['payment_plan_at_end','final_churn_status'])['active_users'].describe().round(0)
df.groupby(['payment_plan_at_end','final_churn_status'])['warehouse_rows_synced'].describe().round(0)
df.groupby(['payment_plan_at_end','prev_churn_status'])['all_integrations_count'].describe().round(0)
usage = df.groupby(['u_customer_id','prev_churn_status','final_churn_status','payment_plan_at_end'])[['all_integrations_count','active_users','total_events','mtu','throughput','all_sources_count']].median().reset_index()
usage = usage.merge(customer_info[['u_customer_id','tenure']],on='u_customer_id',how='inner')
fig = px.box(usage[usage['mtu']<300000],x='payment_plan_at_end',y='mtu',points='all',color='final_churn_status',
title='MTU by Payment Plan',height=500,width=800)
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
))
fig.show()
usage[usage['u_customer_id'].isin(['oycurahdsuuc','pcjprrldioxl','ejbnkfpdixdv'])]
conditions = [
(usage['mtu'] <= 10000),
(usage['mtu'] > 10000) & (usage['mtu'] <= 25000),
(usage['mtu'] > 25000) & (usage['mtu'] <= 100000),
(usage['mtu'] > 100000)]
values = ['-10k','10k-25k', '25k-100k','100k+']
usage['mtu_cat'] = np.select(conditions, values)
usage_mtu = usage.groupby(['mtu_cat','payment_plan_at_end','final_churn_status']).agg({'u_customer_id':'count'}).reset_index()
fig = px.box(usage[usage['all_integrations_count']<100],x='mtu_cat',y='all_integrations_count',
color='final_churn_status',facet_col='payment_plan_at_end',
labels={'mtu_cat':'MTU category'},
title = 'How complex integrations affect retention and churn?')
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
))
#fig.update_traces(boxpoints=False)
fig.show()
px.box(usage[usage['all_sources_count']<50],x='mtu_cat',y='all_sources_count',color='final_churn_status',facet_col='payment_plan_at_end')
#px.scatter(df[df['api_calls']<100000000],x='months_since_start',y='api_calls',facet_row='prev_churn_status')
fig = px.scatter(df[(df['prev_churn_status']=='reactivated') & (df['api_calls']<30000000)],
x='months_since_start',y='api_calls',facet_row='final_churn_status',
title = "Volume of API Calls of Reactivated Customers")
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_yaxes(title='')
fig.update_traces(marker=dict(color='green'))
fig = px.box(df[df['server_side_integration_calls']<500000],y='server_side_integration_calls',
x='months_since_start',facet_row='final_churn_status',
title = 'Distribution of Server Side Integration Calls Over Customer Tenure')
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_yaxes(title='')
#fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', yaxis_title=None)
px.box(df[df['anonymous_users']<100000],y='anonymous_users',x='months_since_start',facet_row='final_churn_status')
fig = px.box(df[df['total_dest_cat_count']<=50],y='total_dest_cat_count',x='months_since_start',
facet_col='final_churn_status',
labels={'total_dest_cat_count':'Total Destination Categories Count',
'months_since_start':'Month Since Start'})
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_yaxes(title='')
fig.show()
avg_tenure = pd.read_sql("""with t1 as
(select payment_plan_at_end,final_churn_status,prev_churn_status ,u_customer_id,max_month_available
from churn_info
where prev_churn_status != 'reactivated'
union
select payment_plan_at_end,final_churn_status,prev_churn_status,u_customer_id,first_churn_month
from churn_info ci
where prev_churn_status = 'reactivated')
select payment_plan_at_end,final_churn_status,prev_churn_status,
case
when prev_churn_status = 'first_churn' and final_churn_status = 'churned' then final_churn_status
when prev_churn_status = 'never_churned' and final_churn_status = 'retained' then final_churn_status
when prev_churn_status = 'reactivated' and final_churn_status = 'churned' then concat(prev_churn_status,'-',final_churn_status)
else concat(prev_churn_status,'-',final_churn_status)
end as churn_status,
round(avg(max_month_available),0) as avg from t1
group by payment_plan_at_end,final_churn_status,prev_churn_status;;""",engine)
avg_tenure
fig = px.bar(avg_tenure,x='churn_status',y='avg',barmode='group',
facet_col='payment_plan_at_end',text='avg',labels={'avg':'Avg Tenure'})
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_xaxes(title='')